Wstęp do uczenia maszynowego - Projekt 1¶

Mikołaj Piórczyński, Michał Tomczyk, grupa 3¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split 

# Preprocessing
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler

# Model 
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Feature Engineering 
from sklearn.feature_selection import VarianceThreshold, RFE

# Hyperparameters tuning 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold

# Evaluation 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, balanced_accuracy_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc, roc_auc_score  
from sklearn.model_selection import cross_val_score

# XAI
import dalex as dx
import shap
import warnings
warnings.filterwarnings("ignore")
In [2]:
df = pd.read_csv('data/census_income_dataset.csv')
df.head()
Out[2]:
age workclass fnlwgt education education_num marital_status occupation relationship race sex capital_gain capital_loss hours_per_week native_country income_level
0 39 State-gov 77516.0 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174.0 0.0 40.0 United-States <=50K
1 50 Self-emp-not-inc 83311.0 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0.0 0.0 13.0 United-States <=50K
2 38 Private 215646.0 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0.0 0.0 40.0 United-States <=50K
3 53 Private 234721.0 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0.0 0.0 40.0 United-States <=50K
4 28 Private 338409.0 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0.0 0.0 40.0 Cuba <=50K

Braki danych¶

In [3]:
# zmienne kategoryczne
df.replace('?', np.nan, inplace=True)

# zmienne numeryczne
df.replace(-100000, np.nan, inplace=True)

Zmienna native_country ma kategorię, która ma znacznie więcej obserwacji niż inne kategorie, więc możemy tą wartością zastąpić braki danych.

In [4]:
df['native_country'].fillna(value="United-States", inplace=True)

W przypadku zmiennych workclass oraz occupation skorzystajmy z imputacji LOCF.

In [5]:
df['workclass'].fillna(method = 'ffill', inplace = True)
df['occupation'].fillna(method = 'ffill', inplace = True)

Dummy model¶

In [6]:
X = df.drop(columns=['income_level'])
y = df['income_level']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

dc = DummyClassifier(strategy='most_frequent')
dc.fit(X_train, y_train)

preds = dc.predict(X_test)
print(f'Accuracy for dummy model: {accuracy_score(y_test, preds)}')
Accuracy for dummy model: 0.7606715119254785

Ewaluacja¶

In [6]:
def show_metrics(model, X, y):
    """
    Prints out the most important metrics
    """ 
    y_pred = model.predict(X)
    
    sns.heatmap(confusion_matrix(y, y_pred), annot=True)
    plt.show()
    
    print('Accuracy: ', accuracy_score(y, y_pred))
    print('Precision: ', precision_score(y, y_pred))
    print('Recall: ', recall_score(y, y_pred))
    print('F1 score: ', f1_score(y, y_pred))
In [30]:
def gini_train_val(model, X_train, y_train, X_val, y_val):
        
    y_pred_proba = model.predict_proba(X_train)[::,1]
    fpr, tpr, _ = roc_curve(y_train,  y_pred_proba)
    plt.plot(fpr, tpr, label='X_train')
    roc_auc = auc(fpr, tpr)
    gini_train = (2 * roc_auc) - 1
    
    y_pred_proba = model.predict_proba(X_val)[::,1]
    fpr, tpr, _ = roc_curve(y_val,  y_pred_proba)
    plt.plot(fpr, tpr, label='X_val')
    roc_auc = auc(fpr, tpr)
    gini_val = (2 * roc_auc) - 1

    plt.plot([0, 1], [0, 1], "--")
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title("ROC Curve")
    plt.legend()
    plt.show()
    
    print("gini_train: %.4f" % gini_train)    
    print("gini_val: %.4f" % gini_val)

    return
In [8]:
def gini_compare_models(model1, model2, X1, y1, X2, y2):
    
    y_pred_proba = model1.predict_proba(X1)[::,1]
    fpr, tpr, _ = roc_curve(y1,  y_pred_proba)
    plt.plot(fpr, tpr, label='model1')
    roc_auc = auc(fpr, tpr)
    gini1 = (2 * roc_auc) - 1
    
    y_pred_proba = model2.predict_proba(X2)[::,1]
    fpr, tpr, _ = roc_curve(y2,  y_pred_proba)
    plt.plot(fpr, tpr, label='model2')
    roc_auc = auc(fpr, tpr)
    gini2 = (2 * roc_auc) - 1

    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend()
    plt.show()
    
    print("Gini for model 1: %.4f" % gini1)
    print("Gini for model 2: %.4f" % gini2)
    

Wybór zmiennych¶

Zmienna fnlwgt to przypisane do poszczególnych obserwacji wagi odzwierciedlającę wielkość populacji, z której próbkowane były poszczególne obserwacje. Zmienna ta nie będzie przydatna przy modelowaniu, dlatego usuwamy ją.

In [9]:
df.drop(columns=['fnlwgt'], inplace=True)

Wstępne modelowanie¶

In [11]:
X = df.drop(columns=['income_level', 'education'])
y = df['income_level'].map({'<=50K': 0, '>50K': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

base_transformer = ColumnTransformer(
    [
        ('one_hot', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ],
    remainder='passthrough'
)

base_models = [
    ('lr_model', LogisticRegression(random_state=123, n_jobs=-1)), 
    ('svm_model', SVC(random_state=123)),
    ('dt_model', DecisionTreeClassifier(random_state=123)),
    ('rf_model', RandomForestClassifier(random_state=123)),
    ('xgb_model', (XGBClassifier(random_state=123)))
]

kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
for name, model in base_models:
    pipe = Pipeline(
        [
            ('transformer', base_transformer),
            ('model', model)
        ]
    )
    cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
    print(f'{name} cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
lr_model cross validation accuracy score: 0.8325952680041286 +/- 0.010626215579691066 min: 0.8170185540627 max: 0.8463211772232886
svm_model cross validation accuracy score: 0.8026257774118374 +/- 0.002088968710590899 min: 0.799334527770668 max: 0.8052463211772233
dt_model cross validation accuracy score: 0.8211552242877878 +/- 0.002345955530738737 min: 0.818914768364474 max: 0.8253358925143954
rf_model cross validation accuracy score: 0.8465434860179271 +/- 0.00468027513949076 min: 0.8383670335295623 max: 0.8510556621880998
xgb_model cross validation accuracy score: 0.8722647262218297 +/- 0.000840029306686614 min: 0.8708893154190659 max: 0.8733205374280231
In [10]:
def gini_score(y_true, y_score):
    return (2 * roc_auc_score(y_true, y_score)) - 1
In [72]:
test_results = []

X = df.drop(columns=['income_level', 'education'])
y = df['income_level'].map({'<=50K': 0, '>50K': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

base_transformer = ColumnTransformer(
    [
        ('one_hot', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ],
    remainder='passthrough'
)

base_models = [
    ('lr_model', LogisticRegression(random_state=123, n_jobs=-1)), 
    ('dt_model', DecisionTreeClassifier(random_state=123)),
    ('rf_model', RandomForestClassifier(random_state=123)),
    ('xgb_model', (XGBClassifier(random_state=123)))
]

for name, model in base_models:
    pipe = Pipeline(
        [
            ('transformer', base_transformer),
            ('model', model)
        ]
    )
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    y_pred_proba = pipe.predict_proba(X_test)[::, 1]
    measures_results = {"model": name}
    for measure in [accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score]:
        measures_results[measure.__name__] = measure(y_test, y_pred)
        print(f'{name} {measure.__name__}: {measure(y_test, y_pred)}')
    for measure in [gini_score, roc_auc_score]:
        measures_results[measure.__name__] = measure(y_test, y_pred_proba)
        print(f'{name} {measure.__name__}: {measure(y_test, y_pred_proba)}')
    test_results.append(measures_results)
    print('----------')
lr_model accuracy_score: 0.8382638959975433
lr_model balanced_accuracy_score: 0.7399342557171832
lr_model precision_score: 0.7082417582417583
lr_model recall_score: 0.5513259195893926
lr_model f1_score: 0.6200096200096199
lr_model gini_score: 0.7542867434287661
lr_model roc_auc_score: 0.8771433717143831
----------
dt_model accuracy_score: 0.8261848705087522
dt_model balanced_accuracy_score: 0.7488503585711672
dt_model precision_score: 0.6476014760147601
dt_model recall_score: 0.6005132591958939
dt_model f1_score: 0.6231691078561917
dt_model gini_score: 0.5484344190101829
dt_model roc_auc_score: 0.7742172095050914
----------
rf_model accuracy_score: 0.8483979936533934
rf_model balanced_accuracy_score: 0.7641842159155937
rf_model precision_score: 0.7185109637939827
rf_model recall_score: 0.6026518391787853
rf_model f1_score: 0.6555012793672947
rf_model gini_score: 0.7797038715693936
rf_model roc_auc_score: 0.8898519357846968
----------
[00:45:34] WARNING: D:\bld\xgboost-split_1637426510059\work\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
xgb_model accuracy_score: 0.8736820554816256
xgb_model balanced_accuracy_score: 0.7941418621894569
xgb_model precision_score: 0.7911392405063291
xgb_model recall_score: 0.6415739948674081
xgb_model f1_score: 0.7085498346717053
xgb_model gini_score: 0.8557686518651952
xgb_model roc_auc_score: 0.9278843259325976
----------
In [85]:
test_results_df = pd.DataFrame(test_results)
test_results_df = pd.melt(test_results_df, id_vars="model", var_name="measure", value_name="value")
sns.catplot(x='measure', y='value', hue='model', data=test_results_df, kind='bar', aspect=3)
plt.show()

Kategoryzacja zmiennych ciągłych¶

In [12]:
X = df.drop(columns=['income_level', 'education'])
y = df['income_level'].map({'<=50K': 0, '>50K': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

transformer = ColumnTransformer(
    [
        ('age_discretizer', KBinsDiscretizer(n_bins=15, encode='ordinal'), ['age']),
        ('discretizer', KBinsDiscretizer(n_bins=5,  encode='ordinal'), ['capital_loss', 'capital_gain', 'hours_per_week']),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough'
)

xgb_model = XGBClassifier(random_state=123, n_jobs=-1)

kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
    [
        ('transformer', transformer),
        ('model', xgb_model)
    ]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8407851091950549 +/- 0.0018503125137839295 min: 0.8373432300998208 max: 0.8428662827895074

Skalowanie zmiennych ciągłych¶

In [13]:
X = df.drop(columns=['income_level', 'education'])
y = df['income_level'].map({'<=50K': 0, '>50K': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

transformer = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('log', FunctionTransformer(np.log1p), ['capital_loss']),
        ('std_scaler', StandardScaler(), ['age', 'capital_loss']),
        ('min_max_scaler', MinMaxScaler(), ['capital_gain', 'hours_per_week'])
    ],
    remainder='passthrough'
)

xgb_model = XGBClassifier(random_state=123, n_jobs=-1)

kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
    [
        ('transformer', transformer),
        ('model', xgb_model)
    ]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8734164101017237 +/- 0.001380531779514911 min: 0.8714011516314779 max: 0.8754958413307742

Inżynieria cech¶

In [13]:
def group_marital_status(df: pd.DataFrame) -> pd.DataFrame:
    df_res = df.copy()
    df_res['marital_status'] = df_res['marital_status'].replace(
        ['Married-civ-spouse', 'Married-AF-spouse'], 'Married'
    )
    
    df_res['marital_status'] = df_res['marital_status'].replace(
        ['Never-married', 'Divorced', 'Separated', 'Widowed', 'Married-spouse-absent'], 'Living-alone'
    )
    
    return df_res  
In [15]:
df2 = group_marital_status(df)

X = df2.drop(columns=['income_level', 'education'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

transformer = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('std_scaler', StandardScaler(), ['age']),
        ('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
    ],
    remainder='passthrough'
)

xgb_model = XGBClassifier(random_state=123, n_jobs=-1)

kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
    [
        ('transformer', transformer),
        ('model', xgb_model)
    ]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8732116887172505 +/- 0.0021027070782437133 min: 0.8707613563659629 max: 0.8769033909149072
In [14]:
def create_living_alone(df: pd.DataFrame) -> pd.DataFrame:
    df_res = df.copy()
    df_res['living_alone'] = df_res['marital_status'].replace(
        ['Married-civ-spouse', 'Married-AF-spouse'], 'Married'
    )
    
    df_res['living_alone'] = df_res['living_alone'].replace(
        ['Never-married', 'Divorced', 'Separated', 'Widowed', 'Married-spouse-absent'], 'Living-alone'
    )
    
    return df_res  
In [17]:
df2 = create_living_alone(df)

X = df2.drop(columns=['income_level', 'education'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'living_alone']

transformer = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('std_scaler', StandardScaler(), ['age']),
        ('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
    ],
    remainder='passthrough'
)

xgb_model = XGBClassifier(random_state=123, n_jobs=-1)

kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
    [
        ('transformer', transformer),
        ('model', xgb_model)
    ]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8724950492422922 +/- 0.001137462079053026 min: 0.8710007678525723 max: 0.8742162507997441
In [15]:
def group_race(df: pd.DataFrame) -> pd.DataFrame:
    df_res = df.copy()
    df_res['marital_status'] = df_res['marital_status'].replace(
        ['Asian-Pac-Islander', 'Amer-Indian-Eskimo'], 'Other'
    )
    
    return df_res  
In [19]:
df2 = group_race(df)

X = df2.drop(columns=['income_level', 'education'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

transformer = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('std_scaler', StandardScaler(), ['age']),
        ('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
    ],
    remainder='passthrough'
)

xgb_model = XGBClassifier(random_state=123, n_jobs=-1)

kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
    [
        ('transformer', transformer),
        ('model', xgb_model)
    ]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8734164101017237 +/- 0.001380531779514911 min: 0.8714011516314779 max: 0.8754958413307742
In [16]:
def group_education(df: pd.DataFrame) -> pd.DataFrame:
    df_res = df.copy()
    df_res['education'] = df_res['education'].replace(
            ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th'], 'Obligatory'
    )
    df_res['education'] = df_res['education'].replace(['HS-grad', 'Some-college'], 'HS-college')
    df_res['education'] = df_res['education'].replace(['Assoc-voc', 'Assoc-acdm'], 'Assoc')
    df_res['education'] = df_res['education'].replace(['Prof-school', 'Doctorate'], 'Academic')
    return df_res
In [21]:
df2 = group_education(df)

X = df2.drop(columns=['income_level', 'education_num'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features =  ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

transformer = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('ordinal', OrdinalEncoder(), ['education']),
        ('std_scaler', StandardScaler(), ['age']),
        ('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
    ],
    remainder='passthrough'
)

xgb_model = XGBClassifier(random_state=123, n_jobs=-1)

kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
    [
        ('transformer', transformer),
        ('model', xgb_model)
    ]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.872648580455278 +/- 0.0012443643478918457 min: 0.870744816995137 max: 0.8743442098528471
In [17]:
def group_countries(df: pd.DataFrame) -> pd.DataFrame:
    df_res = df.copy()
    df_res['native_country'] = df_res['native_country'].apply(lambda x: x if x == "United-States" or x == "Mexico" else "Other")
    return df_res
In [23]:
df2 = group_countries(df)

X = df2.drop(columns=['income_level', 'education'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features =  ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

transformer = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('std_scaler', StandardScaler(), ['age']),
        ('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
    ],
    remainder='passthrough'
)

xgb_model = XGBClassifier(random_state=123, n_jobs=-1)

kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
    [
        ('transformer', transformer),
        ('model', xgb_model)
    ]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8724438295946986 +/- 0.0014491703431791297 min: 0.870744816995137 max: 0.8747280870121561
In [18]:
def group_gdp(df: pd.DataFrame, gdp: pd.DataFrame) -> pd.DataFrame:
    df_res = df.copy()
    low = gdp[gdp['Country Name'] == 'Low income']['gdp'].values[0]
    low_middle = gdp[gdp['Country Name'] == 'Lower middle income']['gdp'].values[0]
    upper_middle = gdp[gdp['Country Name'] == 'Upper middle income']['gdp'].values[0]
    high = gdp[gdp['Country Name'] == 'High income']['gdp'].values[0]

    conditions = [
        (df_res["gdp"].lt(low)),
        (df_res["gdp"].ge(low) & df_res["gdp"].lt(low_middle)),
        (df_res["gdp"].ge(low_middle) & df_res["gdp"].lt(upper_middle)),
        (df_res["gdp"].ge(upper_middle) & df_res["gdp"].lt(high)),
        (df_res["gdp"].ge(high)),
    ]
    choices = ["Low income", "Lower middle income", "Middle income", "Upper middle income", "High income"]

    df_res["gdp_level"] = np.select(conditions, choices)
    return df_res
In [25]:
gdp = pd.read_csv('data/gdp.csv')

merged = df.merge(gdp, how='left', left_on='native_country', right_on='Country Name')
merged.drop(columns=['Country Name', 'Country Code'], inplace=True)
merged.fillna(method='ffill', inplace=True)

df2 = group_gdp(merged, gdp)

X = df2.drop(columns=['income_level', 'gdp', 'education'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features =  ['workclass', 'marital_status', 'gdp_level', 'occupation', 'relationship', 'race', 'sex', 'native_country']

transformer = ColumnTransformer(
    [
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('std_scaler', StandardScaler(), ['age']),
        ('min_max_scaler', MinMaxScaler(), ['capital_gain', 'capital_loss', 'hours_per_week'])
    ],
    remainder='passthrough'
)

xgb_model = XGBClassifier(random_state=123, n_jobs=-1)

kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
    [
        ('transformer', transformer),
        ('model', xgb_model)
    ]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8725205984763147 +/- 0.001852653585169723 min: 0.870744816995137 max: 0.8754958413307742

Strojenie hiperparametrów¶

In [19]:
gdp = pd.read_csv('data/gdp.csv')

merged = df.merge(gdp, how='left', left_on='native_country', right_on='Country Name')
merged.drop(columns=['Country Name', 'Country Code'], inplace=True)
merged.fillna(method='ffill', inplace=True)

df2 = group_gdp(merged, gdp)
df2 = group_countries(df2)
df2 = group_education(df2)
df2 = group_race(df2)
df2 = create_living_alone(df2)

X = df2.drop(columns=['income_level', 'gdp', 'education_num'])
y = df2['income_level'].map({'<=50K': 0, '>50K': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features = ['workclass', 'marital_status', 'living_alone', 'gdp_level', 'occupation', 'relationship', 'race', 'sex', 'native_country']

transformer = ColumnTransformer(
    [
        ('log', FunctionTransformer(np.log1p), ['capital_loss']),
        ('std_scaler', StandardScaler(), ['age', 'capital_loss']),
        ('min_max_scaler', MinMaxScaler(), ['capital_gain', 'hours_per_week']),
        ('ordinal', OrdinalEncoder(), ['education']),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough'
)

xgb_model = XGBClassifier(random_state=123, n_jobs=-1)

kfolds = 5
split = KFold(n_splits=kfolds, shuffle=True, random_state=123)
pipe = Pipeline(
    [
        ('transformer', transformer),
        ('model', xgb_model)
    ]
)
cv_results = cross_val_score(pipe, X_train, y_train, cv=split, scoring='accuracy', n_jobs=-1)
print(f'xgb_model cross validation accuracy score: {cv_results.mean()} +/- {cv_results.std()} min: {cv_results.min()} max: {cv_results.max()}')
xgb_model cross validation accuracy score: 0.8729301067477193 +/- 0.0013467930057077492 min: 0.8708727924238546 max: 0.8751119641714651
In [20]:
params_distr = {
    "model__n_estimators": [50, 100, 200],
    "model__max_depth": [2, 3, 5, 10, 15],
    "model__learning_rate": [0.5, 0.3, 0.1, 0.01],
}

search = RandomizedSearchCV(estimator=pipe, param_distributions=params_distr, verbose=True, n_jobs=-1)
search.fit(X, y)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[10:43:18] WARNING: D:\bld\xgboost-split_1637426510059\work\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Out[20]:
RandomizedSearchCV(estimator=Pipeline(steps=[('transformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('log',
                                                                               FunctionTransformer(func=<ufunc 'log1p'>),
                                                                               ['capital_loss']),
                                                                              ('std_scaler',
                                                                               StandardScaler(),
                                                                               ['age',
                                                                                'capital_loss']),
                                                                              ('min_max_scaler',
                                                                               MinMaxScaler(),
                                                                               ['capital_gain',
                                                                                'hours_per_week']),
                                                                              ('ordinal',
                                                                               OrdinalEncoder(),
                                                                               ['education']),
                                                                              ('...
                                                            n_estimators=100,
                                                            n_jobs=-1,
                                                            num_parallel_tree=None,
                                                            predictor=None,
                                                            random_state=123,
                                                            reg_alpha=None,
                                                            reg_lambda=None,
                                                            scale_pos_weight=None,
                                                            subsample=None,
                                                            tree_method=None,
                                                            validate_parameters=None,
                                                            verbosity=None))]),
                   n_jobs=-1,
                   param_distributions={'model__learning_rate': [0.5, 0.3, 0.1,
                                                                 0.01],
                                        'model__max_depth': [2, 3, 5, 10, 15],
                                        'model__n_estimators': [50, 100, 200]},
                   verbose=True)
In [21]:
best_random = search.best_estimator_
print(best_random) 
Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('log',
                                                  FunctionTransformer(func=<ufunc 'log1p'>),
                                                  ['capital_loss']),
                                                 ('std_scaler',
                                                  StandardScaler(),
                                                  ['age', 'capital_loss']),
                                                 ('min_max_scaler',
                                                  MinMaxScaler(),
                                                  ['capital_gain',
                                                   'hours_per_week']),
                                                 ('ordinal', OrdinalEncoder(),
                                                  ['education']),
                                                 ('one_hot_encoder',
                                                  OneHotEncode...
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='', learning_rate=0.5,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=200,
                               n_jobs=-1, num_parallel_tree=1, predictor='auto',
                               random_state=123, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_method='exact', validate_parameters=1,
                               verbosity=None))])
In [22]:
print(search.best_params_)
{'model__n_estimators': 200, 'model__max_depth': 3, 'model__learning_rate': 0.5}
In [23]:
show_metrics(best_random, X_test, y_test)
Accuracy:  0.8822806837956803
Precision:  0.8146186440677966
Recall:  0.6578272027373824
F1 score:  0.7278750591575959
In [31]:
gini_train_val(best_random, X_train, y_train, X_test, y_test)
gini_train: 0.8791
gini_val: 0.8771

XAI¶

In [25]:
explainer = dx.Explainer(best_random, X_train, y_train)
Preparation of a new explainer is initiated

  -> data              : 39073 rows 14 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 39073 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x000001C77059FB80> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 1.15e-07, mean = 0.24, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.975, mean = -0.000513, max = 0.999
  -> model_info        : package sklearn

A new explainer has been created!
In [26]:
explainer.model_parts().plot()
In [27]:
explainer.model_profile().plot()
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 14/14 [00:03<00:00,  4.09it/s]
In [34]:
explainer.model_profile(variables=['age']).plot(geom='profiles')
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.19it/s]
In [28]:
explainer.model_profile(variable_type = 'categorical', variables=['living_alone', 'education']).plot()
Calculating ceteris paribus: 100%|███████████████████████████████████████████████████████| 2/2 [00:00<00:00, 11.00it/s]
In [37]:
best_random.predict(X_test.iloc[[24]])
Out[37]:
array([0], dtype=int64)
In [38]:
y_test.iloc[[24]]
Out[38]:
43257    0
Name: income_level, dtype: int64
In [35]:
explainer.predict_parts(X_test.iloc[[24]]).plot()
In [39]:
best_random.predict(X_test.iloc[[23]])
Out[39]:
array([1], dtype=int64)
In [40]:
y_test.iloc[[23]]
Out[40]:
17318    1
Name: income_level, dtype: int64
In [29]:
explainer.predict_parts(X_test.iloc[[23]]).plot()
In [112]:
X_train[cat_features].apply(lambda x: len(x.unique()))
Out[112]:
workclass          8
marital_status     7
living_alone       2
gdp_level          4
occupation        14
relationship       6
race               5
sex                2
native_country     3
dtype: int64

Ewaluacja¶

In [35]:
X = df.drop(columns=['income_level', 'education'])
y = df['income_level'].map({'<=50K': 0, '>50K': 1})

base_X_train, base_X_test, base_y_train, base_y_test = train_test_split(X, y, train_size=0.8, random_state=123, stratify=y)

cat_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

base_transformer = ColumnTransformer(
    [
        ('one_hot', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ],
    remainder='passthrough'
)

xgb_model = XGBClassifier(random_state=123, n_jobs=-1)

base_pipe = Pipeline(
    [
        ('transformer', base_transformer),
        ('model', xgb_model)
    ]
)

base_pipe.fit(base_X_train, base_y_train)
[23:14:46] WARNING: D:\bld\xgboost-split_1637426510059\work\src\learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
Out[35]:
Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('one_hot',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['workclass',
                                                   'marital_status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native_country'])])),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_...
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=-1, num_parallel_tree=1, predictor='auto',
                               random_state=123, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=1, subsample=1,
                               tree_method='exact', validate_parameters=1,
                               verbosity=None))])
In [36]:
gini_compare_models(best_random, base_pipe, X_test, y_test, base_X_test, base_y_test)
Gini for model 1: 0.8734
Gini for model 2: 0.8558

Podsumowanie¶

Do naszego zbioru danych najlepszym modelem okazał się być XGBoost. Aby sprawdzić jak możemy poprawić osiągniętą dokładność modelu (w wersji bazowej 0.872) dokonaliśmy operacji kategoryzacji i skalowania zmiennych ciągłych i wykonaliśmy feature engineering. Nie przyniosły one jednak istotnych zmian, wskaźnik accuracy zmieniał się na co najwyżej 3. miejscu po przecinku. Wykonaliśmy także random search, które pozwoliło nam wybrać odpowiednie hiperparametry modelu. Aby lepiej zobaczyć działanie modelu skorzystaliśmy też z pakietu XAI.